# author: zhanghao
# time: 2023-12-28
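# info: OCR for PDF files and images using PaddlePaddle (飞桨) PaddleOCR
# install: see the PaddleOCR quickstart guide linked above the PaddleOcr class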
import os.path
import re
import threading

from fitz import fitz
from paddleocr import PaddleOCR, PPStructure

from helper.file_helper import FileHelper
from core.ocr_result import OcrResult
from config.logger import Logger


# PaddlePaddle (飞桨) installation guide: https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/doc/doc_ch/quickstart.md
class PaddleOcr:
    """OCR front end for PDF files and images, backed by PaddleOCR.

    PDF pages are rendered to PNG files inside a per-thread scratch directory
    under ``<cwd>/ocrCacheDir`` and are then recognised one by one.

    NOTE: the logger and both engines are class attributes, so they are
    created once, when this class body is executed, and are shared by every
    instance.
    """

    logger = Logger().get()
    paddle_ocr = PaddleOCR(use_angle_cls=True, lang="ch")
    table_engine = PPStructure(recovery=True, lang='ch')

    # Name of the directory that holds intermediate results
    base_dir_name = 'ocrCacheDir'

    # Full path of the cache directory; assigned per instance in __init__
    cache_dir_path = ''

    # Splits a file name into alternating text / digit-run pieces
    _DIGIT_RUN = re.compile(r'([0-9]+)')

    def __init__(self):
        """Create the cache directory under the current working directory."""
        self.cache_dir_path = os.path.join(os.getcwd(), self.base_dir_name)
        # exist_ok avoids the check-then-create race between threads
        os.makedirs(self.cache_dir_path, exist_ok=True)

    def getTempDir(self) -> str:
        """Return the scratch directory of the calling thread, creating it if needed.

        The directory is ``<cache_dir_path>/<thread ident>``.
        """
        temp_dir = os.path.join(self.cache_dir_path, str(threading.get_ident()))
        os.makedirs(temp_dir, exist_ok=True)
        return temp_dir

    # Main entry point for PDF files
    def processPdf(self, pdf_path):
        """Run OCR on every page of the PDF at *pdf_path*; return an OcrResult."""
        return self.pdfOcr(pdf_path)

    # Main entry point for single images
    def processImg(self, img_path):
        """Run OCR on the image at *img_path*; return an OcrResult.

        A missing file yields an OcrResult built from an empty list.
        """
        return OcrResult(self.img2text(img_path))

    def pdfOcr(self, pdf_path):
        """Render the PDF to page images, then OCR them in page order."""
        self.logger.info('开始处理pdf文件: ' + pdf_path)

        self.pdf2img(pdf_path)
        return self.processImgPath()

    def pdf2img(self, pdf_path):
        """Render each page of *pdf_path* to ``image_<n>.png`` in the temp dir.

        Page images left over from a previous run in the same temp dir are
        removed first, so they cannot leak into this document's OCR result.
        The PDF document is always closed, even when rendering fails.
        """
        temp_dir = self.getTempDir()
        self._removeStalePages(temp_dir)

        pdf_doc = fitz.open(pdf_path)
        try:
            page_count = pdf_doc.page_count
            self.logger.info('pdf2img: ' + str(page_count))

            # 2x zoom on both axes, no rotation; identical for every page
            mat = fitz.Matrix(2, 2).prerotate(0)
            for idx in range(page_count):
                pix = pdf_doc[idx].get_pixmap(matrix=mat, alpha=False)
                img_path = os.path.join(temp_dir, f'image_{idx + 1}.png')
                self.logger.info('save:' + img_path)
                pix.save(img_path)
        finally:
            pdf_doc.close()

        self.logger.info('pdf文件保存图片成功，数量：' + str(page_count))

    def processImgPath(self):
        """OCR every file in the calling thread's temp dir; return an OcrResult.

        Files are processed in natural order (``image_2`` before
        ``image_10``) so that the recognised text follows the page order.
        """
        temp_dir = self.getTempDir()
        images = FileHelper.fetchDirFile(temp_dir)
        file_names = sorted(images, key=self._naturalSortKey)
        self.logger.info(file_names)

        text_list = []
        for img_file in file_names:
            text_list.extend(self.img2text(os.path.join(temp_dir, img_file)))

        return OcrResult(text_list)

    def img2text(self, img_path) -> list:
        """Return the text lines recognised in the image at *img_path*.

        Returns an empty list when the file does not exist or when no text
        is detected.
        """
        if not os.path.exists(img_path):
            return []

        result = self.paddle_ocr.ocr(img_path, cls=True)

        text_line = []
        for line in result or []:
            # The per-image entry is None/empty when nothing was detected
            # (e.g. a blank page); skip it instead of iterating over None.
            if not line:
                continue
            # Each res is indexed as res[1][0] to get the recognised text
            text_line.extend(res[1][0] for res in line)

        return text_line

    @classmethod
    def _naturalSortKey(cls, file_name):
        """Sort key that compares digit runs numerically, case-insensitively."""
        pieces = cls._DIGIT_RUN.split(file_name.lower())
        # re.split with a capturing group alternates text (even index) and
        # digits (odd index), so compared positions always share a type.
        return [int(piece) if i % 2 else piece for i, piece in enumerate(pieces)]

    @staticmethod
    def _removeStalePages(temp_dir):
        """Delete ``image_*.png`` files left in *temp_dir* by an earlier run."""
        for name in os.listdir(temp_dir):
            if not (name.startswith('image_') and name.endswith('.png')):
                continue
            stale_path = os.path.join(temp_dir, name)
            if os.path.isfile(stale_path):
                os.remove(stale_path)
